Team members: Yuhong Lu, Xiaohan Mei, Ziyan Pei, Peng Yuan, Mengqing Zhang, Jiayuan Zou
library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.2.1     v purrr   0.3.2
v tibble  2.1.3     v dplyr   0.8.3
v tidyr   0.8.3     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(readr)
library(magrittr)
Attaching package: 'magrittr'
The following object is masked from 'package:purrr':
set_names
The following object is masked from 'package:tidyr':
extract
library(ggplot2)
# Load the main App Store table from the working directory.
# NOTE(review): readr reports an unnamed first column and names it 'X1' --
# presumably a saved row index; it is dropped later when the tables are merged.
AppleStore <- read_csv(file = "AppleStore.csv")
Missing column names filled in: 'X1' [1]
Parsed with column specification:
cols(
  X1 = col_double(),
  id = col_double(),
  track_name = col_character(),
  size_bytes = col_double(),
  currency = col_character(),
  price = col_double(),
  rating_count_tot = col_double(),
  rating_count_ver = col_double(),
  user_rating = col_double(),
  user_rating_ver = col_double(),
  ver = col_character(),
  cont_rating = col_character(),
  prime_genre = col_character(),
  sup_devices.num = col_double(),
  ipadSc_urls.num = col_double(),
  lang.num = col_double(),
  vpp_lic = col_double()
)
# Load the companion table holding each app's description text.
descript <- read_csv(file = "appleStore_description.csv")
Parsed with column specification:
cols(
  id = col_double(),
  track_name = col_character(),
  size_bytes = col_double(),
  app_desc = col_character()
)
# Preview both tables as loaded.
AppleStore
descript

# Apple Store ----
# Store the currency code as a factor (categorical) instead of plain text.
AppleStore[["currency"]] <- as.factor(AppleStore[["currency"]])
# Identify the primary key ----
# A column can only act as the key if none of its values repeats, so list
# every value that occurs more than once in each candidate column. An empty
# result means the column is unique within that table.
AppleStore %>%
  group_by(id) %>%
  summarise(count = n()) %>%
  filter(count > 1)
AppleStore %>%
  group_by(track_name) %>%
  summarise(count = n()) %>%
  filter(count > 1)
descript %>%
  group_by(id) %>%
  summarise(count = n()) %>%
  filter(count > 1)
descript %>%
  group_by(track_name) %>%
  summarise(count = n()) %>%
  filter(count > 1)
# Conclusion: `id` is the primary key.
# Merge the two datasets ----
# Join on `id` with base merge(), which by default keeps only ids present in
# both tables. Then drop the filled-in index column `X1` and the duplicated
# size/name columns, keeping the description table's size as `size_byte`.
# NOTE(review): the surviving name column keeps its merge suffix
# (`track_name.x`) -- confirm that downstream code expects that name.
AP <- merge(AppleStore, descript, by = "id") %>%
  select(-X1, -size_bytes.x, -track_name.y) %>%
  rename(size_byte = size_bytes.y)

# Discard every row that has a missing value, then confirm none remain.
AP_omit <- na.omit(AP)
any(is.na(AP_omit))
[1] FALSE
# Save the cleaned, merged table to disk, then display it.
# NOTE(review): row.names is left at its default, so the CSV gains an unnamed
# leading index column -- confirm whether downstream readers rely on it.
write.csv(AP_omit, file = "AP_omit.csv")
AP_omit
NA